{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/smueller/miniconda3/envs/python-3.11.5/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from transformers import pipeline\n",
    "import torch\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pick the best available accelerator: Apple Silicon GPU (MPS) first, then CUDA,\n",
    "# and fall back to the CPU so the notebook also runs on machines without a GPU\n",
    "if torch.backends.mps.is_available():\n",
    "    device = \"mps\"\n",
    "elif torch.cuda.is_available():\n",
    "    device = \"cuda\"\n",
    "else:\n",
    "    device = \"cpu\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['document_id', 'text', 'year', 'fulldate', 'harmonised', 'ownership',\n",
       "       'ownership_clean', 'ownership_3', 'ownership_2', 'ownership_2_lag',\n",
       "       'propnum', 'election_year', 'seniority', 'constituency_name',\n",
       "       'dublin_dummy', 'party', 'party_recoded', 'party_broad', 'name',\n",
       "       'author_id', 'id', 'in_reply_to_user_id', 'created_at',\n",
       "       'conversation_id', 'lang', 'public_metrics', 'date', 'mpname', 'land',\n",
       "       'candidate', 'first_pref_share', 'gender', 'district_magnitude_man',\n",
       "       'running_sum', 'elected_sum', 'term', 'housing_committee', 'info',\n",
       "       'housing_pos', 'county_recoded', 'n_houses', 'mean_price',\n",
       "       'median_price', 'mean_price_lag', 'median_price_lag',\n",
       "       'perc_change_mean', 'perc_change_median', 'change_ownership',\n",
       "       'change_became_owner', 'gov_opp', 'type', 'title', 'respondent',\n",
       "       'birth', 'housing_bert', 'score'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load full dataset\n",
    "df_all = pd.read_parquet(\"data_dontshare/data_analysis_classified_housing.parquet\")\n",
    "\n",
    "# get column names\n",
    "df_all.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>document_id</th>\n",
       "      <th>text</th>\n",
       "      <th>year</th>\n",
       "      <th>fulldate</th>\n",
       "      <th>harmonised</th>\n",
       "      <th>ownership</th>\n",
       "      <th>ownership_clean</th>\n",
       "      <th>ownership_3</th>\n",
       "      <th>ownership_2</th>\n",
       "      <th>ownership_2_lag</th>\n",
       "      <th>...</th>\n",
       "      <th>perc_change_median</th>\n",
       "      <th>change_ownership</th>\n",
       "      <th>change_became_owner</th>\n",
       "      <th>gov_opp</th>\n",
       "      <th>type</th>\n",
       "      <th>title</th>\n",
       "      <th>respondent</th>\n",
       "      <th>birth</th>\n",
       "      <th>housing_bert</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>adams gerry_2014_tweets_6</td>\n",
       "      <td>Government must get Terms of Reference on moth...</td>\n",
       "      <td>2014</td>\n",
       "      <td>2014-06-11</td>\n",
       "      <td>adams gerry</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Personal residence</td>\n",
       "      <td>Homeowner Only</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>...</td>\n",
       "      <td>3.118387</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Opposition</td>\n",
       "      <td>Tweets</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1948</td>\n",
       "      <td>Housing</td>\n",
       "      <td>0.929408</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>476</th>\n",
       "      <td>adams gerry_2018_tweets_264</td>\n",
       "      <td>Homes For All! https://t.co/OAzypEGSop</td>\n",
       "      <td>2018</td>\n",
       "      <td>2018-10-03</td>\n",
       "      <td>adams gerry</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Personal residence</td>\n",
       "      <td>Homeowner Only</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>...</td>\n",
       "      <td>9.090909</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Opposition</td>\n",
       "      <td>Tweets</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1948</td>\n",
       "      <td>Housing</td>\n",
       "      <td>0.611560</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>617</th>\n",
       "      <td>adams gerry_2017_tweets_60</td>\n",
       "      <td>Crisis in Justice &amp; Gardai like the crisis hou...</td>\n",
       "      <td>2017</td>\n",
       "      <td>2017-11-23</td>\n",
       "      <td>adams gerry</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Personal residence</td>\n",
       "      <td>Homeowner Only</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>...</td>\n",
       "      <td>16.789039</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Opposition</td>\n",
       "      <td>Tweets</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1948</td>\n",
       "      <td>Housing</td>\n",
       "      <td>0.814124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>660</th>\n",
       "      <td>adams gerry_2015_tweets_77</td>\n",
       "      <td>All praise 4 Irish success but no mention of p...</td>\n",
       "      <td>2015</td>\n",
       "      <td>2015-04-30</td>\n",
       "      <td>adams gerry</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Personal residence</td>\n",
       "      <td>Homeowner Only</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>...</td>\n",
       "      <td>14.290310</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Opposition</td>\n",
       "      <td>Tweets</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1948</td>\n",
       "      <td>Housing</td>\n",
       "      <td>0.841938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>698</th>\n",
       "      <td>adams gerry_2016_tweets_125</td>\n",
       "      <td>Gerry Adams: New politics looks very like old ...</td>\n",
       "      <td>2016</td>\n",
       "      <td>2016-06-14</td>\n",
       "      <td>adams gerry</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Personal residence</td>\n",
       "      <td>Homeowner Only</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>Homeowner or Landlord</td>\n",
       "      <td>...</td>\n",
       "      <td>13.873526</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Opposition</td>\n",
       "      <td>Tweets</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>1948</td>\n",
       "      <td>Housing</td>\n",
       "      <td>0.956801</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 56 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     document_id  \\\n",
       "45     adams gerry_2014_tweets_6   \n",
       "476  adams gerry_2018_tweets_264   \n",
       "617   adams gerry_2017_tweets_60   \n",
       "660   adams gerry_2015_tweets_77   \n",
       "698  adams gerry_2016_tweets_125   \n",
       "\n",
       "                                                  text  year    fulldate  \\\n",
       "45   Government must get Terms of Reference on moth...  2014  2014-06-11   \n",
       "476             Homes For All! https://t.co/OAzypEGSop  2018  2018-10-03   \n",
       "617  Crisis in Justice & Gardai like the crisis hou...  2017  2017-11-23   \n",
       "660  All praise 4 Irish success but no mention of p...  2015  2015-04-30   \n",
       "698  Gerry Adams: New politics looks very like old ...  2016  2016-06-14   \n",
       "\n",
       "      harmonised  ownership     ownership_clean     ownership_3  \\\n",
       "45   adams gerry        1.0  Personal residence  Homeowner Only   \n",
       "476  adams gerry        1.0  Personal residence  Homeowner Only   \n",
       "617  adams gerry        1.0  Personal residence  Homeowner Only   \n",
       "660  adams gerry        1.0  Personal residence  Homeowner Only   \n",
       "698  adams gerry        1.0  Personal residence  Homeowner Only   \n",
       "\n",
       "               ownership_2        ownership_2_lag  ...  perc_change_median  \\\n",
       "45   Homeowner or Landlord  Homeowner or Landlord  ...            3.118387   \n",
       "476  Homeowner or Landlord  Homeowner or Landlord  ...            9.090909   \n",
       "617  Homeowner or Landlord  Homeowner or Landlord  ...           16.789039   \n",
       "660  Homeowner or Landlord  Homeowner or Landlord  ...           14.290310   \n",
       "698  Homeowner or Landlord  Homeowner or Landlord  ...           13.873526   \n",
       "\n",
       "     change_ownership change_became_owner     gov_opp    type title  \\\n",
       "45                0.0                 0.0  Opposition  Tweets  None   \n",
       "476               0.0                 0.0  Opposition  Tweets  None   \n",
       "617               0.0                 0.0  Opposition  Tweets  None   \n",
       "660               0.0                 0.0  Opposition  Tweets  None   \n",
       "698               0.0                 0.0  Opposition  Tweets  None   \n",
       "\n",
       "    respondent birth housing_bert     score  \n",
       "45        None  1948      Housing  0.929408  \n",
       "476       None  1948      Housing  0.611560  \n",
       "617       None  1948      Housing  0.814124  \n",
       "660       None  1948      Housing  0.841938  \n",
       "698       None  1948      Housing  0.956801  \n",
       "\n",
       "[5 rows x 56 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# keep only the rows where housing_bert == \"Housing\"\n",
    "df_all_housing = df_all[df_all[\"housing_bert\"] == \"Housing\"]\n",
    "\n",
    "# a bare expression in the middle of a cell is silently discarded, so print the\n",
    "# (rows, columns) shape explicitly\n",
    "print(df_all_housing.shape)\n",
    "\n",
    "# show the first five rows\n",
    "df_all_housing.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract the text column as a plain Python list to pass to the classifier\n",
    "text_data = df_all_housing['text'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
      "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "# load the sentiment model as a Hugging Face pipeline\n",
    "# - truncation = True caps every input at max_length tokens; with max_length alone\n",
    "#   an over-long text is not truncated and can fail inside the model\n",
    "# - padding = True pads only as far as needed instead of padding every text to\n",
    "#   512 tokens, which avoids a lot of wasted computation on short texts\n",
    "classifier_cardiff = pipeline(model='cardiffnlp/twitter-roberta-base-sentiment-latest',\n",
    "                              max_length = 512,\n",
    "                              truncation = True,\n",
    "                              batch_size = 64,\n",
    "                              device = device,\n",
    "                              padding = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The pipeline call took 8.20 minutes.\n"
     ]
    }
   ],
   "source": [
    "# start the timer (perf_counter is a monotonic clock, so the measurement is not\n",
    "# affected by system clock adjustments during the long run)\n",
    "start_time = time.perf_counter()\n",
    "\n",
    "# classify every text and collect the predictions in a DataFrame\n",
    "dat_classified_cardiff = pd.DataFrame(classifier_cardiff(text_data))\n",
    "\n",
    "# get the elapsed time in seconds\n",
    "end_time = time.perf_counter()\n",
    "elapsed_time = end_time - start_time\n",
    "\n",
    "# print the time it took to run the pipeline\n",
    "print(f\"The pipeline call took {elapsed_time/60:.2f} minutes.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename the generic label/score columns so they identify the Cardiff model;\n",
    "# a single non-mutating rename keeps the cell safe to re-run\n",
    "dat_classified_cardiff = dat_classified_cardiff.rename(\n",
    "    columns = {'label': 'sentiment_cardiff', 'score': 'score_cardiff'}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# attach the source text and document_id from df_all_housing to the predictions;\n",
    "# values are matched by position, so both frames must be in the same row order\n",
    "dat_classified_cardiff = dat_classified_cardiff.assign(\n",
    "    text_classified = df_all_housing['text'].tolist(),\n",
    "    document_id = df_all_housing['document_id'].tolist(),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sentiment_cardiff</th>\n",
       "      <th>score_cardiff</th>\n",
       "      <th>text_classified</th>\n",
       "      <th>document_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>neutral</td>\n",
       "      <td>0.878377</td>\n",
       "      <td>Government must get Terms of Reference on moth...</td>\n",
       "      <td>adams gerry_2014_tweets_6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>positive</td>\n",
       "      <td>0.723941</td>\n",
       "      <td>Homes For All! https://t.co/OAzypEGSop</td>\n",
       "      <td>adams gerry_2018_tweets_264</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>negative</td>\n",
       "      <td>0.782479</td>\n",
       "      <td>Crisis in Justice &amp; Gardai like the crisis hou...</td>\n",
       "      <td>adams gerry_2017_tweets_60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>negative</td>\n",
       "      <td>0.580475</td>\n",
       "      <td>All praise 4 Irish success but no mention of p...</td>\n",
       "      <td>adams gerry_2015_tweets_77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>neutral</td>\n",
       "      <td>0.748234</td>\n",
       "      <td>Gerry Adams: New politics looks very like old ...</td>\n",
       "      <td>adams gerry_2016_tweets_125</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  sentiment_cardiff  score_cardiff  \\\n",
       "0           neutral       0.878377   \n",
       "1          positive       0.723941   \n",
       "2          negative       0.782479   \n",
       "3          negative       0.580475   \n",
       "4           neutral       0.748234   \n",
       "\n",
       "                                     text_classified  \\\n",
       "0  Government must get Terms of Reference on moth...   \n",
       "1             Homes For All! https://t.co/OAzypEGSop   \n",
       "2  Crisis in Justice & Gardai like the crisis hou...   \n",
       "3  All praise 4 Irish success but no mention of p...   \n",
       "4  Gerry Adams: New politics looks very like old ...   \n",
       "\n",
       "                   document_id  \n",
       "0    adams gerry_2014_tweets_6  \n",
       "1  adams gerry_2018_tweets_264  \n",
       "2   adams gerry_2017_tweets_60  \n",
       "3   adams gerry_2015_tweets_77  \n",
       "4  adams gerry_2016_tweets_125  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dat_classified_cardiff.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(53231, 4)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get number of rows\n",
    "dat_classified_cardiff.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# store as parquet file\n",
    "dat_classified_cardiff.to_parquet(\"data_dontshare/housing_content_sentiment_cardiff.parquet\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
